import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib
matplotlib.use('Agg')
from matplotlib.pyplot import plot,savefig
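# seaborn wraps matplotlib and is used for the statistical plots below.
# Note: in seaborn < 0.8 merely importing it replaced matplotlib's default
# style with seaborn's; newer releases only do so after an explicit sns.set().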
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# OverallQual: overall quality rating
class solutions:
    """House-price regression experiment: load, plot, encode, filter, fit.

    Typical call order (each step relies on attributes set by the previous
    ones): ``dataencoding`` -> ``datasweep`` ->
    ``olsregression_gradientdescent`` -> ``test``.

    Attributes set along the way:
        data_train, data_test, data_sample_submission: raw CSV contents.
        newdata, newdatatest: one-hot encoded frames; after ``datasweep``
            they hold only the selected features plus a constant column.
        martix, martixtest: float design matrices built from the two frames
            (the misspelt names are kept because other code uses them).
        SalePrice: training target as a NumPy array.
        delta: fitted coefficient vector (last entry is the intercept).
        err_list: training mean squared error, one entry per iteration.
    """

    # Folder the figures were originally written to.  It is still the default
    # so existing runs keep producing files in the same place; pass
    # ``output_dir`` to the constructor to write somewhere else.
    _DEFAULT_OUTPUT_DIR = "C:\\Users\\LENOVO\\OneDrive\\桌面\\aip\\House_Price"
    output_dir = None

    def __init__(self, train_path='train.csv', test_path='test.csv',
                 sample_submission_path='sample_submission.csv',
                 output_dir=None):
        """Read the three CSV files and widen pandas' console output.

        Args:
            train_path: CSV with the training rows (must contain SalePrice).
            test_path: CSV with the rows to predict.
            sample_submission_path: CSV used as the template for the export.
            output_dir: folder for generated figures; ``None`` keeps the
                original hard-coded location.
        """
        self.data_train = pd.read_csv(train_path)
        self.data_sample_submission = pd.read_csv(sample_submission_path)
        self.data_test = pd.read_csv(test_path)
        self.output_dir = output_dir

        # Global pandas display options: show every column on one wide line.
        pd.set_option('display.width', 10000)
        pd.set_option('display.max_columns', None)

    def _output_path(self, *parts):
        """Return the path of an output file given its path components."""
        if self.output_dir is None:
            # Reproduce the original literal paths exactly.
            return "\\".join((self._DEFAULT_OUTPUT_DIR,) + parts)
        import os
        path = os.path.join(self.output_dir, *parts)
        folder = os.path.dirname(path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        return path

    def print(self):
        """Print the training set's column labels, first rows and shape."""
        print(self.data_train.keys())
        print(self.data_train.head())
        print(self.data_train.shape)

    def plotdatastest1(self):
        """Box-plot SalePrice per MSSubClass and save it as ``temp.png``."""
        var = 'MSSubClass'
        data = self.data_train[['SalePrice', var]]
        axes = sns.boxplot(x=var, y="SalePrice", data=data)
        axes.axis(ymin=0, ymax=800000)
        # Save before show(): on interactive backends show() leaves an empty
        # figure behind, so saving afterwards would write a blank image.
        plt.savefig('temp.png')
        plt.show()

    def plotdatastest2(self):
        """Box-plot SalePrice per CentralAir and save it as ``temp.png``."""
        var = 'CentralAir'
        data = self.data_train[['SalePrice', var]]
        sns.boxplot(x=var, y="SalePrice", data=data)
        plt.savefig('temp.png')
        plt.show()

    def _save_boxplot(self, data, column):
        """Draw SalePrice against ``column`` as a box plot and save it."""
        plt.figure()
        sns.boxplot(x=column, y="SalePrice", data=data)
        plt.savefig(self._output_path(
            'picture', 'boxplot', '{}boxplot.jpg'.format(column)))
        # Close the figure so plotting every column does not pile up memory.
        plt.close()

    def plotdata(self):
        """Save one plot set per training column against SalePrice.

        Non-numeric columns get a box plot; numeric columns get a scatter
        plot and a box plot.  The target column itself is skipped.
        """
        # DataFrame.items() replaces iteritems(), removed in pandas 2.0.
        for column, values in self.data_train.items():
            if column == 'SalePrice':
                # Plotting the target against itself would need a frame with
                # two identically named columns.
                continue
            data = self.data_train[['SalePrice', column]]
            if pd.api.types.is_numeric_dtype(values):
                plt.figure()
                plt.scatter(x=column, y="SalePrice", data=data)
                plt.savefig(self._output_path(
                    'picture', 'scatter', '{}scatter.jpg'.format(column)))
                plt.close()
            self._save_boxplot(data, column)

    def dataencoding(self):
        """One-hot encode the training and test frames independently.

        The two results may have different columns; ``datasweep`` aligns
        them before building the design matrices.
        """
        self.newdata = pd.get_dummies(self.data_train)
        self.newdatatest = pd.get_dummies(self.data_test)

    def corrplot(self):
        """Print and plot the correlation matrix of the numeric columns."""
        # Restrict to numeric columns explicitly: pandas >= 2.0 no longer
        # drops non-numeric columns silently inside corr().
        corrmat = self.data_train.select_dtypes(include=[np.number]).corr()
        print(corrmat)
        plt.subplots(figsize=(20, 9))
        sns.heatmap(corrmat, vmax=0.8, square=True)
        plt.savefig(self._output_path('corr.jpg'))
        plt.show()

    def datasweep(self, threshold=0.3):
        """Select features and build aligned train/test design matrices.

        Keeps the encoded columns whose absolute correlation with SalePrice
        is at least ``threshold``.  Columns with an undefined (NaN)
        correlation, e.g. constant ones, are dropped as well.

        Sets ``SalePrice``, ``newdata``, ``newdatatest``, ``martix`` and
        ``martixtest``.  Both frames end with a constant column of ones that
        serves as the intercept; it is named ``'SalePrice'`` for
        compatibility with the existing code.

        Args:
            threshold: minimum absolute correlation for a column to be kept.
        """
        train = self.newdata.astype(float)
        test = self.newdatatest.astype(float)

        correlation = train.corr()['SalePrice']
        kept = [
            column for column, value in correlation.items()
            if column != 'SalePrice' and abs(value) >= threshold
        ]

        self.SalePrice = np.array(self.newdata['SalePrice'])

        train_features = train[kept]
        train_features = train_features.fillna(train_features.mean())
        train_features['SalePrice'] = 1.0

        # Give the test frame exactly the training columns, in the same
        # order.  Dummy columns that only occur in the training data are
        # filled with 0; columns that only occur in the test data are dropped.
        test_features = test.reindex(columns=kept, fill_value=0.0)
        test_features = test_features.fillna(test_features.mean())
        test_features['SalePrice'] = 1.0

        self.newdata = train_features
        self.newdatatest = test_features
        self.martix = np.asarray(train_features, dtype=float)
        self.martixtest = np.asarray(test_features, dtype=float)

    def olsbysklearn(self):
        """Fit scikit-learn's LinearRegression and print the training R^2."""
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        model.fit(self.martix, self.SalePrice)
        score = model.score(self.martix, self.SalePrice)
        print("The train score is:{}".format(score))

    def olsregression_gradientdescent(self, alpha=0.0000001, max_iter=101,
                                      tol=1000):
        """Fit least-squares coefficients by batch gradient descent.

        Starts from a zero vector and repeats
        ``delta -= alpha * X.T (X delta - y) / n``.  The loop ends after
        ``max_iter`` updates, or earlier once the training mean squared
        error measured before an update is at or below ``tol``.

        Sets ``delta`` (always) and ``err_list`` (one mean squared error per
        iteration, measured before that iteration's update).

        Args:
            alpha: learning rate.
            max_iter: maximum number of updates.
            tol: mean-squared-error level at which iteration stops.

        Raises:
            ValueError: if the design matrix has no rows.
        """
        features = np.asarray(self.martix, dtype=float)
        target = np.asarray(self.SalePrice, dtype=float)
        n_samples = features.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit: the design matrix has no rows")

        print("shape is:{}".format(features.shape))
        delta = np.zeros(features.shape[1])
        print("delta is:{}".format(delta))

        self.err_list = []
        for count in range(max_iter):
            residual = features.dot(delta) - target
            error = float(residual.dot(residual)) / n_samples
            self.err_list.append(error)
            # All coefficients are updated together from the same residual.
            delta = delta - alpha * features.T.dot(residual) / n_samples
            print("count num is:{},error is:{}".format(count, error))
            if error <= tol:
                break
        self.delta = delta

    def _fit_statistics(self):
        """Return ``(R^2, t-values)`` of ``delta`` on the training data.

        R^2 is ``1 - SS_res / SS_tot``.  The t-values use the classical OLS
        standard errors ``sqrt(sigma^2 * diag((X'X)^+))`` with
        ``sigma^2 = SS_res / (n - p)``.  They are exact only when ``delta``
        is the least-squares solution and otherwise an approximation.
        """
        features = np.asarray(self.martix, dtype=float)
        target = np.asarray(self.SalePrice, dtype=float)
        delta = np.asarray(self.delta, dtype=float)

        residual = features.dot(delta) - target
        ss_res = float(residual.dot(residual))
        centered = target - target.mean()
        ss_tot = float(centered.dot(centered))
        r_square = 1.0 - ss_res / ss_tot if ss_tot > 0 else float('nan')

        n_samples, n_params = features.shape
        dof = n_samples - n_params
        sigma2 = ss_res / dof if dof > 0 else float('nan')
        # The pseudo-inverse copes with collinear dummy columns; clip tiny
        # negative round-off before taking the square root.
        gram_inverse = np.linalg.pinv(features.T.dot(features))
        std_err = np.sqrt(sigma2 * np.maximum(np.diag(gram_inverse), 0.0))
        with np.errstate(divide='ignore', invalid='ignore'):
            t_values = delta / std_err
        return r_square, t_values

    def test(self):
        """Report the fit, plot the error trace and export test predictions.

        Prints every training prediction next to its true price, saves the
        per-iteration error curve, writes the test predictions to
        ``dataoutput.csv`` and prints R^2 plus one t-value per coefficient.
        Requires ``datasweep`` and ``olsregression_gradientdescent`` to have
        run first.
        """
        features = np.asarray(self.martix, dtype=float)
        predictions = features.dot(self.delta)
        for prediction, actual in zip(predictions, self.SalePrice):
            print("prediction is :{},initial is:{}".format(prediction, actual))

        plt.plot(np.arange(0, len(self.err_list)), np.array(self.err_list))
        # Save before show() so the written image is never blank.
        plt.savefig(self._output_path('trace.jpg'))
        plt.show()

        # Write into the 'SalePrice' column rather than a stray column
        # labelled 1.  Assumes the template already has that column, as the
        # Kaggle sample submission does; otherwise it is created.
        test_features = np.asarray(self.martixtest, dtype=float)
        self.data_sample_submission['SalePrice'] = test_features.dot(self.delta)
        self.data_sample_submission.to_csv('dataoutput.csv', index=False)

        R_square, t_values = self._fit_statistics()
        print("R-square is:{}".format(R_square))
        for coefficient, t_value in zip(self.delta, t_values):
            print("delta is:{},t is:{}".format(coefficient, t_value))
# Run the experiment end to end: load the CSVs, one-hot encode them, print a
# summary of the training data, select features, fit by gradient descent and
# finally report/export the results.
s = solutions()
for _stage in (
    s.dataencoding,
    s.print,
    s.datasweep,
    s.olsregression_gradientdescent,
    s.test,
):
    _stage()


